Motivación y datasets empleados
El objetivo de nuestro trabajo es estudiar si existe alguna relación entre la vacuna BCG (Bacillus de Calmette y Guérin) contra la tuberculosis y los datos de mortalidad de la COVID-19 en algunos países, ya que hay estudios que sugieren que esta vacuna incrementa las capacidades inmunitarias de la población, hecho que se reflejaría en el número reducido de fallecimientos por COVID-19 en ciertos países. Mediante los conjuntos de datos de BCG y de mortalidad por COVID-19 cedidos por The BCG World Atlas y por el BCG - COVID-19 AI Challenge de Kaggle, vamos a intentar desvelar dichas relaciones.
# Load both datasets. TODO: add an explanation of what each one contains.
BCG_strain <- read_csv("task_2-BCG_strain_per_country-1Nov2020.csv")
COVID_noformat <- read_csv(
  "task_2-COVID-19-death_cases_per_country_after_fifth_death-till_22_September_2020.csv"
)
# Inspecting the data frames with str() prints poorly, so the column listing
# is written out by hand instead.
# str(COVID_noformat)
# str(BCG_strain)
El contenido de las variables BCG_strain y COVID_noformat es el siguiente:
| country_name |
country_name |
| country_code |
alpha_3_code |
| mandatory_bcg_strain_2015-2020 |
date_first_death |
| mandatory_bcg_strain_2010-2015 |
date_fifth_death |
| mandatory_bcg_strain_2005-2010 |
deaths_per_million_10_days_after_fifth_death |
| mandatory_bcg_strain_2000-2005 |
deaths_per_million_15_days_after_fifth_death |
| mandatory_bcg_strain_1990-2000 |
deaths_per_million_20_days_after_fifth_death |
| mandatory_bcg_strain_1980-1990 |
deaths_per_million_25_days_after_fifth_death |
| mandatory_bcg_strain_1970-1980 |
deaths_per_million_30_days_after_fifth_death |
| mandatory_bcg_strain_1960-1970 |
deaths_per_million_35_days_after_fifth_death |
| mandatory_bcg_strain_1950-1960 |
deaths_per_million_40_days_after_fifth_death |
| vaccination_timing_unified |
deaths_per_million_45_days_after_fifth_death |
| BCG Atlas: Which year was vaccination introduced? |
deaths_per_million_50_days_after_fifth_death |
| Year of changes to BCG schedule |
deaths_per_million_55_days_after_fifth_death |
| BCG Atlas: BCG Recommendation Type |
deaths_per_million_60_days_after_fifth_death |
| BCG Atlas: Details of changes |
deaths_per_million_65_days_after_fifth_death |
| BCG Atlas: Timing of 1st BCG? |
deaths_per_million_70_days_after_fifth_death |
| BCG Atlas: BCG Strain |
deaths_per_million_75_days_after_fifth_death |
| BCG Atlas: How long has this BCG vaccine strain been used? |
deaths_per_million_80_days_after_fifth_death |
|
deaths_per_million_85_days_after_fifth_death |
|
deaths_per_million_90_days_after_fifth_death |
|
deaths_per_million_95_days_after_fifth_death |
|
deaths_per_million_100_days_after_fifth_death |
|
deaths_per_million_105_days_after_fifth_death |
|
deaths_per_million_110_days_after_fifth_death |
|
deaths_per_million_115_days_after_fifth_death |
|
deaths_per_million_120_days_after_fifth_death |
|
deaths_per_million_125_days_after_fifth_death |
|
deaths_per_million_130_days_after_fifth_death |
|
deaths_per_million_135_days_after_fifth_death |
|
deaths_per_million_140_days_after_fifth_death |
|
deaths_per_million_145_days_after_fifth_death |
|
deaths_per_million_150_days_after_fifth_death |
|
stringency_index_10_days_after_fifth_death |
|
stringency_index_15_days_after_fifth_death |
|
stringency_index_20_days_after_fifth_death |
|
stringency_index_25_days_after_fifth_death |
|
stringency_index_30_days_after_fifth_death |
|
stringency_index_35_days_after_fifth_death |
|
stringency_index_40_days_after_fifth_death |
|
stringency_index_45_days_after_fifth_death |
|
stringency_index_50_days_after_fifth_death |
|
stringency_index_55_days_after_fifth_death |
|
stringency_index_60_days_after_fifth_death |
|
stringency_index_65_days_after_fifth_death |
|
stringency_index_70_days_after_fifth_death |
|
stringency_index_75_days_after_fifth_death |
|
stringency_index_80_days_after_fifth_death |
|
stringency_index_85_days_after_fifth_death |
|
stringency_index_90_days_after_fifth_death |
|
stringency_index_95_days_after_fifth_death |
|
stringency_index_100_days_after_fifth_death |
|
stringency_index_105_days_after_fifth_death |
|
stringency_index_110_days_after_fifth_death |
|
stringency_index_115_days_after_fifth_death |
|
stringency_index_120_days_after_fifth_death |
|
stringency_index_125_days_after_fifth_death |
|
stringency_index_130_days_after_fifth_death |
|
stringency_index_135_days_after_fifth_death |
|
stringency_index_140_days_after_fifth_death |
|
stringency_index_145_days_after_fifth_death |
|
stringency_index_150_days_after_fifth_death |
Una visualización preliminar de estos datos revela que son todos del tipo string y que además hay muchas columnas sin datos (columnas cuyo único contenido es NULL); por lo tanto, llevaremos a cabo una limpieza de los mismos y un cambio de tipo de variable para que las manipulaciones posteriores sean más cómodas.
# ---- Clean the BCG data ----
# Keep only the columns that contain no missing values.
# NOTE(review): this drops every column with at least one NA, which is
# stricter than "drop the columns that are entirely NA". Later steps rely on
# the resulting column set, so the behaviour is kept as is.
BCG_strain <- BCG_strain[, colSums(is.na(BCG_strain)) == 0]
# For now only whether a vaccine was given matters, not which strain.
# Every column except the first is recoded to:
#   0  - no vaccine was given ("None" in the raw data)
#   1  - a vaccine was given
#   NA - unknown ("Unknown" in the raw data)
BCG_strain_no_strain <- BCG_strain
# lapply() keeps one list element per column regardless of the number of rows
# (sapply() would collapse a single-row table into a plain vector), and the
# vectorised mask also works for empty columns, where an index loop over
# 1:length(x) would fail.
BCG_strain_no_strain[, -1] <-
  lapply(BCG_strain_no_strain[, -1], function(x) {
    # "None" becomes "0"; any value matching "Unknown" becomes NA.
    recoded <- gsub("Unknown", NA, gsub("None", "0", x))
    # Whatever is neither "0" nor NA is treated as "vaccine given".
    recoded[!is.na(recoded) & recoded != "0"] <- "1"
    as.integer(recoded)
  })
####################################################################################
# ---- Clean the COVID data ----
# Keep only the columns that contain no NA values.
# NOTE(review): this drops every column with at least one NA, not only the
# columns that are entirely NA.
COVID_noNA <- COVID_noformat[, colSums(is.na(COVID_noformat)) == 0]
# In this dataset empty values are written as the literal text "NULL";
# turn those into real NA values. The result is a character matrix.
COVID_Na <- sapply(COVID_noNA, function(x) {
  gsub("NULL", NA, x)
})
# Convert the character matrix to a data frame. stringsAsFactors = FALSE is
# explicit because on R < 4.0 the default creates factors, and as.numeric()
# on a factor returns its level codes instead of the numbers it displays.
COVID_Na_df <- as.data.frame(COVID_Na, stringsAsFactors = FALSE)
# Store the dates as Date.
# NOTE(review): assumes day/month/two-digit-year text - confirm against the CSV.
COVID_Na_df[["date_fifth_death"]] <-
  as.Date(COVID_Na_df[["date_fifth_death"]], format = "%d/%m/%y")
COVID_Na_df[["date_first_death"]] <-
  as.Date(COVID_Na_df[["date_first_death"]], format = "%d/%m/%y")
# Store every column after the first four as numeric. Going through
# as.character() guarantees the printed value is parsed, never a factor code.
COVID_Na_df[-(1:4)] <-
  lapply(COVID_Na_df[-(1:4)], function(column) {
    as.numeric(as.character(column))
  })
################################################################################################
# Merge both data frames into one, keeping every row of the BCG table.
COVID_BGC <- left_join(
  BCG_strain_no_strain,
  COVID_Na_df,
  by = "country_name"
)
# Shorten the column names, which are very long.
colnames(COVID_BGC) <- colnames(COVID_BGC) %>%
  gsub("mandatory_bcg_strain_", "strain", .) %>%
  gsub("deaths_per_million", "dpm", .) %>%
  gsub("days_after_fifth_death", "d", .) %>%
  gsub("stringency_index", "si", .)
Nuestra tabla resultante es la siguiente:
Tabla 1. Vacunación de BCG por países y muertes por COVID-19
| Afghanistan |
1 |
1 |
1 |
1 |
1 |
1 |
0 |
0 |
0 |
AFG |
| Albania |
|
|
|
|
|
|
|
0 |
0 |
ALB |
| Algeria |
1 |
|
|
|
|
|
|
0 |
0 |
DZA |
| Angola |
1 |
1 |
1 |
1 |
1 |
1 |
0 |
0 |
0 |
AGO |
| Argentina |
0 |
0 |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
ARG |
| Armenia |
1 |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
ARM |
Tabla 1. Vacunación de BCG por países y muertes por COVID-19
| Afghanistan |
2020-03-24 |
2020-04-05 |
48 |
47 |
41 |
44 |
79 |
80 |
102 |
118 |
| Albania |
2020-03-12 |
2020-03-25 |
155 |
161 |
159 |
169 |
168 |
41 |
35 |
33 |
| Algeria |
2020-03-13 |
2020-03-18 |
49 |
66 |
125 |
153 |
161 |
167 |
167 |
32 |
| Angola |
2020-03-30 |
2020-06-12 |
21 |
17 |
20 |
20 |
21 |
19 |
15 |
19 |
| Argentina |
2020-03-08 |
2020-03-25 |
59 |
68 |
91 |
91 |
116 |
135 |
142 |
153 |
| Armenia |
2020-03-27 |
2020-04-03 |
144 |
155 |
162 |
56 |
49 |
55 |
53 |
71 |
Tabla 1. Vacunación de BCG por países y muertes por COVID-19
| Afghanistan |
134 |
146 |
155 |
165 |
19 |
30 |
37 |
43 |
51 |
58 |
| Albania |
32 |
25 |
23 |
25 |
18 |
22 |
22 |
24 |
26 |
39 |
| Algeria |
33 |
30 |
27 |
31 |
31 |
32 |
36 |
39 |
36 |
41 |
| Angola |
22 |
22 |
56 |
59 |
55 |
78 |
77 |
81 |
79 |
98 |
| Argentina |
160 |
166 |
167 |
24 |
21 |
31 |
42 |
57 |
57 |
65 |
| Armenia |
75 |
109 |
132 |
146 |
161 |
19 |
25 |
28 |
29 |
33 |
Tabla 1. Vacunación de BCG por países y muertes por COVID-19
| Afghanistan |
54 |
64 |
67 |
74 |
72 |
70 |
71 |
74 |
75 |
77 |
| Albania |
51 |
57 |
71 |
84 |
96 |
106 |
104 |
122 |
126 |
128 |
| Algeria |
44 |
45 |
46 |
53 |
50 |
54 |
55 |
61 |
63 |
69 |
| Angola |
99 |
|
|
|
|
|
|
|
|
|
| Argentina |
79 |
83 |
97 |
106 |
111 |
123 |
129 |
141 |
10 |
25 |
| Armenia |
37 |
46 |
48 |
57 |
56 |
55 |
57 |
62 |
65 |
67 |
Tabla 1. Vacunación de BCG por países y muertes por COVID-19
| Afghanistan |
80 |
52 |
47 |
42 |
45 |
45 |
47 |
48 |
56 |
56 |
| Albania |
132 |
52 |
47 |
42 |
54 |
53 |
53 |
55 |
62 |
62 |
| Algeria |
73 |
39 |
50 |
46 |
49 |
56 |
56 |
35 |
41 |
43 |
| Angola |
|
40 |
34 |
32 |
35 |
35 |
35 |
34 |
40 |
47 |
| Argentina |
32 |
1 |
1 |
1 |
1 |
1 |
52 |
54 |
61 |
63 |
| Armenia |
70 |
|
|
|
|
|
|
|
|
|
Tabla 1. Vacunación de BCG por países y muertes por COVID-19
| Afghanistan |
62 |
67 |
60 |
62 |
61 |
61 |
64 |
69 |
64 |
68 |
| Albania |
60 |
66 |
71 |
41 |
41 |
41 |
44 |
45 |
42 |
46 |
| Algeria |
66 |
54 |
56 |
59 |
57 |
57 |
60 |
46 |
43 |
48 |
| Angola |
53 |
59 |
61 |
58 |
56 |
56 |
68 |
72 |
66 |
|
| Argentina |
69 |
74 |
76 |
77 |
71 |
71 |
76 |
79 |
73 |
76 |
| Armenia |
|
|
|
|
|
|
|
|
|
|
Tabla 1. Vacunación de BCG por países y muertes por COVID-19
| Afghanistan |
67 |
63 |
66 |
60 |
52 |
53 |
27 |
30 |
5 |
3 |
| Albania |
41 |
42 |
42 |
37 |
36 |
36 |
31 |
35 |
36 |
35 |
| Algeria |
49 |
48 |
60 |
56 |
57 |
56 |
66 |
68 |
73 |
53 |
| Angola |
|
|
|
|
|
|
|
|
|
|
| Argentina |
76 |
72 |
77 |
71 |
72 |
72 |
75 |
79 |
77 |
77 |
| Armenia |
|
|
|
|
|
|
|
|
|
|
# Correlation matrix of the BCG indicators and the deaths-per-million series,
# computed only on rows with no missing values.
cormat <-
  COVID_BGC %>%
  select(
    -c(country_name, alpha_3_code, date_first_death, date_fifth_death),
    # Drop the stringency-index columns by name instead of by a hard-coded
    # position range, so the selection survives changes in the column count.
    -starts_with("si_")
  ) %>%
  na.omit() %>%
  cor()
cormat2 <- cormat
# Blank the upper triangle so each correlation is drawn only once.
cormat2[upper.tri(cormat2)] <- NA
# Long format (Var1, Var2, value) so the matrix can be plotted with ggplot.
cormat2 <- melt(round(cormat2, 2))
ggplot(cormat2, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_continuous(type = "viridis")

# Interactive heatmap of the full correlation matrix, labelled on both axes
# with the matrix column names.
fig <- plot_ly(
  z = cormat,
  x = colnames(cormat),
  y = colnames(cormat),
  type = "heatmap"
)
fig
# Scatter plot of dpm_50_d against the 2005-2010 BCG indicator, one labelled
# point per country. Points and labels use a jitter with the same seed so
# each label stays next to its point.
ggplot(
  COVID_BGC,
  aes(x = dpm_50_d, y = `strain2005-2010`, label = country_name)
) +
  geom_jitter(position = position_jitter(seed = 1)) +
  geom_label_repel(position = position_jitter(seed = 1), size = 2) +
  xlim(c(-100, 800))
